{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import svm, datasets\n",
    "from spark_sklearn import GridSearchCV     #对估计器的指定参数值进行穷举搜索。\n",
    "from spark_sklearn.util import createLocalSparkSession\n",
    "spark = createLocalSparkSession()\n",
    "sc = spark.sparkContext\n",
    "iris = datasets.load_iris()\n",
    "parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}\n",
    "svr = svm.SVC(gamma='auto')\n",
    "clf = GridSearchCV(sc, svr, parameters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "from pyspark import SparkContext"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help on function createLocalSparkSession in module spark_sklearn.util:\n",
      "\n",
      "createLocalSparkSession(appName='spark-sklearn')\n",
      "    Generates a :class:`SparkSession` utilizing all local cores\n",
      "    with the progress bar disabled but otherwise default config.\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "spark_sklearn.grid_search.GridSearchCV"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf_modle=clf.fit(iris.data, iris.target)\n",
    "help(createLocalSparkSession)\n",
    "type(clf_modle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "spark_sklearn.grid_search.GridSearchCV"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(clf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['mean_fit_time',\n",
       " 'mean_score_time',\n",
       " 'mean_test_score',\n",
       " 'mean_train_score',\n",
       " 'param_C',\n",
       " 'param_kernel',\n",
       " 'params',\n",
       " 'rank_test_score',\n",
       " 'split0_test_score',\n",
       " 'split0_train_score',\n",
       " 'split1_test_score',\n",
       " 'split1_train_score',\n",
       " 'split2_test_score',\n",
       " 'split2_train_score',\n",
       " 'std_fit_time',\n",
       " 'std_score_time',\n",
       " 'std_test_score',\n",
       " 'std_train_score']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sorted(clf.cv_results_.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'mean_fit_time': array([0.00066662, 0.00099993, 0.00066662, 0.00066678]),\n",
       " 'mean_score_time': array([0.04266667, 0.        , 0.00099993, 0.0006667 ]),\n",
       " 'mean_test_score': array([0.98      , 0.97333333, 0.97333333, 0.98      ]),\n",
       " 'mean_train_score': array([0.98999802, 0.98336304, 0.97999604, 0.97999604]),\n",
       " 'param_C': masked_array(data=[1, 1, 10, 10],\n",
       "              mask=[False, False, False, False],\n",
       "        fill_value='?',\n",
       "             dtype=object),\n",
       " 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf'],\n",
       "              mask=[False, False, False, False],\n",
       "        fill_value='?',\n",
       "             dtype=object),\n",
       " 'params': ({'C': 1, 'kernel': 'linear'},\n",
       "  {'C': 1, 'kernel': 'rbf'},\n",
       "  {'C': 10, 'kernel': 'linear'},\n",
       "  {'C': 10, 'kernel': 'rbf'}),\n",
       " 'rank_test_score': array([1, 3, 3, 1]),\n",
       " 'split0_test_score': array([1.        , 0.98039216, 1.        , 0.98039216]),\n",
       " 'split0_train_score': array([0.97979798, 0.96969697, 0.95959596, 0.95959596]),\n",
       " 'split1_test_score': array([0.96078431, 0.96078431, 0.92156863, 0.96078431]),\n",
       " 'split1_train_score': array([1., 1., 1., 1.]),\n",
       " 'split2_test_score': array([0.97916667, 0.97916667, 1.        , 1.        ]),\n",
       " 'split2_train_score': array([0.99019608, 0.98039216, 0.98039216, 0.98039216]),\n",
       " 'std_fit_time': array([0.00047137, 0.        , 0.00047137, 0.00047148]),\n",
       " 'std_score_time': array([0.05753458, 0.        , 0.        , 0.00047143]),\n",
       " 'std_test_score': array([0.01617914, 0.00902067, 0.03715363, 0.01592466]),\n",
       " 'std_train_score': array([0.00824863, 0.01254825, 0.01649726, 0.01649726])}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.cv_results_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,\n",
       "  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',\n",
       "  max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
       "  tol=0.001, verbose=False)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.best_estimator_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(clf.cv_results_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd_data = pd.DataFrame(clf.cv_results_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean_fit_time</th>\n",
       "      <th>mean_score_time</th>\n",
       "      <th>mean_test_score</th>\n",
       "      <th>mean_train_score</th>\n",
       "      <th>param_C</th>\n",
       "      <th>param_kernel</th>\n",
       "      <th>params</th>\n",
       "      <th>rank_test_score</th>\n",
       "      <th>split0_test_score</th>\n",
       "      <th>split0_train_score</th>\n",
       "      <th>split1_test_score</th>\n",
       "      <th>split1_train_score</th>\n",
       "      <th>split2_test_score</th>\n",
       "      <th>split2_train_score</th>\n",
       "      <th>std_fit_time</th>\n",
       "      <th>std_score_time</th>\n",
       "      <th>std_test_score</th>\n",
       "      <th>std_train_score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.000667</td>\n",
       "      <td>0.042667</td>\n",
       "      <td>0.980000</td>\n",
       "      <td>0.989998</td>\n",
       "      <td>1</td>\n",
       "      <td>linear</td>\n",
       "      <td>{u'kernel': u'linear', u'C': 1}</td>\n",
       "      <td>1</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.979798</td>\n",
       "      <td>0.960784</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.979167</td>\n",
       "      <td>0.990196</td>\n",
       "      <td>0.000471</td>\n",
       "      <td>0.057535</td>\n",
       "      <td>0.016179</td>\n",
       "      <td>0.008249</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.001000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.973333</td>\n",
       "      <td>0.983363</td>\n",
       "      <td>1</td>\n",
       "      <td>rbf</td>\n",
       "      <td>{u'kernel': u'rbf', u'C': 1}</td>\n",
       "      <td>3</td>\n",
       "      <td>0.980392</td>\n",
       "      <td>0.969697</td>\n",
       "      <td>0.960784</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.979167</td>\n",
       "      <td>0.980392</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.009021</td>\n",
       "      <td>0.012548</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.000667</td>\n",
       "      <td>0.001000</td>\n",
       "      <td>0.973333</td>\n",
       "      <td>0.979996</td>\n",
       "      <td>10</td>\n",
       "      <td>linear</td>\n",
       "      <td>{u'kernel': u'linear', u'C': 10}</td>\n",
       "      <td>3</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.959596</td>\n",
       "      <td>0.921569</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.980392</td>\n",
       "      <td>0.000471</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.037154</td>\n",
       "      <td>0.016497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.000667</td>\n",
       "      <td>0.000667</td>\n",
       "      <td>0.980000</td>\n",
       "      <td>0.979996</td>\n",
       "      <td>10</td>\n",
       "      <td>rbf</td>\n",
       "      <td>{u'kernel': u'rbf', u'C': 10}</td>\n",
       "      <td>1</td>\n",
       "      <td>0.980392</td>\n",
       "      <td>0.959596</td>\n",
       "      <td>0.960784</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.980392</td>\n",
       "      <td>0.000471</td>\n",
       "      <td>0.000471</td>\n",
       "      <td>0.015925</td>\n",
       "      <td>0.016497</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   mean_fit_time  mean_score_time  mean_test_score  mean_train_score param_C  \\\n",
       "0       0.000667         0.042667         0.980000          0.989998       1   \n",
       "1       0.001000         0.000000         0.973333          0.983363       1   \n",
       "2       0.000667         0.001000         0.973333          0.979996      10   \n",
       "3       0.000667         0.000667         0.980000          0.979996      10   \n",
       "\n",
       "  param_kernel                            params  rank_test_score  \\\n",
       "0       linear   {u'kernel': u'linear', u'C': 1}                1   \n",
       "1          rbf      {u'kernel': u'rbf', u'C': 1}                3   \n",
       "2       linear  {u'kernel': u'linear', u'C': 10}                3   \n",
       "3          rbf     {u'kernel': u'rbf', u'C': 10}                1   \n",
       "\n",
       "   split0_test_score  split0_train_score  split1_test_score  \\\n",
       "0           1.000000            0.979798           0.960784   \n",
       "1           0.980392            0.969697           0.960784   \n",
       "2           1.000000            0.959596           0.921569   \n",
       "3           0.980392            0.959596           0.960784   \n",
       "\n",
       "   split1_train_score  split2_test_score  split2_train_score  std_fit_time  \\\n",
       "0                 1.0           0.979167            0.990196      0.000471   \n",
       "1                 1.0           0.979167            0.980392      0.000000   \n",
       "2                 1.0           1.000000            0.980392      0.000471   \n",
       "3                 1.0           1.000000            0.980392      0.000471   \n",
       "\n",
       "   std_score_time  std_test_score  std_train_score  \n",
       "0        0.057535        0.016179         0.008249  \n",
       "1        0.000000        0.009021         0.012548  \n",
       "2        0.000000        0.037154         0.016497  \n",
       "3        0.000471        0.015925         0.016497  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help on function createLocalSparkSession in module spark_sklearn.util:\n",
      "\n",
      "createLocalSparkSession(appName='spark-sklearn')\n",
      "    Generates a :class:`SparkSession` utilizing all local cores\n",
      "    with the progress bar disabled but otherwise default config.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "help(createLocalSparkSession)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression as SKL_LogisticRegression\n",
    "from sklearn.linear_model import LinearRegression as SKL_LinearRegression\n",
    "from pyspark.ml.regression import LinearRegression, LinearRegressionModel\n",
    "from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from spark_sklearn import Converter\n",
    "converter = Converter(sc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "skl_lr = SKL_LinearRegression().fit(iris.data, iris.target)\n",
    "lr = converter.toSpark(skl_lr)       #scikit-learn模型转化为PySpark ML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pyspark.ml.regression.LinearRegressionModel"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(lr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "sklearn.linear_model.base.LinearRegression"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(skl_lr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_skl_lr = converter.toSKLearn(lr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "sklearn.linear_model.base.LinearRegression"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(new_skl_lr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
